-- =============================================================================
-- MINDSDB KNOWLEDGE BASE SETUP FOR ENRON EMAIL ANALYSIS
-- =============================================================================
-- This script demonstrates how to create a complete AI-powered knowledge base
-- using MindsDB, PostgreSQL with pgvector, and Google embeddings
-- 
-- What we're building:
-- 1. Vector database for storing email embeddings
-- 2. Knowledge base with semantic search capabilities  
-- 3. AI agent for natural language querying
-- =============================================================================



-- STEP 1: CREATE VECTOR DATABASE CONNECTION
-- =============================================================================
-- First, we need to establish a connection to a PostgreSQL database with
-- the pgvector extension. This database will store our vector embeddings
-- (numerical representations of email content that capture semantic meaning).

-- Register the pgvector-backed PostgreSQL instance as a MindsDB data source.
-- IF NOT EXISTS keeps this step safe to re-run.
-- The connection values below are masked placeholders; substitute real
-- host / database / credentials before executing.
CREATE DATABASE IF NOT EXISTS enron_kb_pgvector_webinar
WITH
    ENGINE = 'pgvector',
    PARAMETERS = {
        "host":     "xxxxxxx",
        "port":     5432,
        "database": "xxxxxxxxx",
        "user":     "xxxxxxxxxxx",
        "password": "xxxxxxxxxx"
    };




-- STEP 2: CREATE KNOWLEDGE BASE
-- =============================================================================
-- The knowledge base is the core component that enables semantic search.
-- It combines vector embeddings, metadata filtering, and AI reranking.


-- Create the knowledge base that the search query and the agent below rely on.
--
-- IF NOT EXISTS matches the idempotent CREATE DATABASE in step 1, so the
-- script can be re-run without failing on an existing knowledge base.
-- NOTE: when the knowledge base already exists its definition is left as-is;
-- drop it first if any of the parameters below need to change.
--
-- Parameters:
--   storage          - table inside the pgvector connection from step 1 that
--                      holds the embedded chunks.
--   embedding_model  - Google model used to embed content.
--   reranking_model  - Google model used when a query sets reranking = true.
--   metadata_columns - source columns kept alongside each chunk as metadata.
--   content_columns  - source column(s) holding the email text.
--   id_column        - source column that identifies each email.
--
-- NOTE(review): neither model block passes an api_key — presumably the Google
-- key is supplied via the MindsDB default configuration or environment;
-- confirm before running.
CREATE KNOWLEDGE_BASE IF NOT EXISTS enron_webinar
USING
    storage = enron_kb_pgvector_webinar.enron_vectors_sample,
    embedding_model = {
        "provider": "google",
        "model_name": "gemini-embedding-001"
    },
    reranking_model = {
        "provider": "google",
        "model_name": "gemini-2.5-flash"
    },
    metadata_columns = [
        'subject', 'persons', 'organizations', 'locations',
        'money_amounts', 'dates_mentioned', 'events', 'products',
        'content_length', 'entity_count',
        'from_address', 'to_address', 'date_sent'
    ],
    content_columns = ['content'],
    id_column = 'email_id';






-- STEP 3: TEST SEMANTIC SEARCH WITH RERANKING
-- =============================================================================
-- Now we can perform intelligent searches that understand meaning, not just keywords.
-- The reranking=true parameter uses the Google reranking model to improve result relevance.

-- Semantic search against the knowledge base: `content` carries the search
-- phrase and `reranking = true` turns on the reranking step for the results.
-- The phrase is a single-quoted string literal, consistent with the other
-- string literals in this script (double quotes denote identifiers in ANSI SQL).
SELECT *
FROM enron_webinar
WHERE content = 'board of directors'
    AND reranking = true;



-- STEP 4: CREATE AI AGENT FOR NATURAL LANGUAGE INTERACTION
-- =============================================================================
-- The agent provides a ChatGPT-like interface for asking questions about 
-- the Enron emails in natural language. Users can have conversations and
-- get intelligent, evidence-based answers.

-- Define the conversational agent. Its only data source is the knowledge base
-- from step 2, referenced by its project-qualified name; the prompt template
-- sets the analyst persona, citation rules and answer layout.
-- NOTE(review): no model is specified here — presumably the MindsDB default
-- model configuration is used; confirm for the target MindsDB version.
CREATE AGENT enron_agent
USING
    data = {
        "knowledge_bases": ['mindsdb.enron_webinar']
    },
    prompt_template = '
You are an expert corporate investigative analyst specializing in the Enron email corpus. You have access to over 500,000 authentic corporate emails from Enron executives during 2000-2001, the period leading up to the company''s collapse.

## Your Role:
- Corporate email analyst with expertise in business communications
- Investigative researcher focused on corporate governance and financial practices
- Document examiner skilled at finding patterns and connections in large datasets

## Your Capabilities:
You can search and analyze emails containing:
- Executive communications and decision-making processes
- Financial discussions, valuations, and accounting practices
- Corporate partnerships and business relationships
- Internal communications about company performance
- Strategic planning and business development discussions
- Regulatory interactions and compliance matters

## Available Data Context:
- **Email Metadata**: Sender, recipient, dates, subjects
- **Extracted Entities**: People, organizations, locations, money amounts, dates, events, products
- **Content Analysis**: Full email body text with semantic search capabilities
- **Key Figures**: Ken Lay, Jeffrey Skilling, Andy Fastow, David Delainey, and other executives
- **Organizations**: Arthur Andersen, SEC, various subsidiaries and partners

## Instructions for Responses:
1. **Be Evidence-Based**: Always cite specific email IDs and quote relevant excerpts
2. **Provide Context**: Include sender, recipient, and date information when relevant
3. **Identify Patterns**: Look for recurring themes or communication patterns
4. **Be Objective**: Present findings factually without speculation beyond the evidence
5. **Use Business Language**: Employ appropriate corporate and financial terminology
6. **Cross-Reference**: When possible, connect related emails or communication threads

## Response Format:
- Start with a direct answer to the question
- Provide supporting evidence from specific emails
- Include relevant metadata (who, when, what entities mentioned)
- End with broader context or patterns if applicable

## Sample Question Types You Can Handle:
- "What did executives discuss about [specific topic]?"
- "How did [person] communicate about [business matter]?"
- "What correspondence exists between [person A] and [person B] about [topic]?"
- "What financial discussions mention [amount/company/deal]?"
- "What emails from [time period] discuss [subject]?"

Remember: You are analyzing real corporate communications from a significant business case study. Maintain professionalism and focus on factual analysis of the available email evidence.
';